package org.apache.lucene.codecs.compressing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.packed.BlockPackedReaderIterator;
import org.apache.lucene.util.packed.PackedInts;

import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.CODEC_SFX_DAT;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.CODEC_SFX_IDX;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.FLAGS_BITS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.OFFSETS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PACKED_BLOCK_SIZE;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.PAYLOADS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.POSITIONS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VECTORS_INDEX_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CHUNK_STATS;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingTermVectorsWriter.VERSION_START;

/**
 * {@link TermVectorsReader} for {@link CompressingTermVectorsFormat}.
 * @lucene.experimental
 */
public final class CompressingTermVectorsReader extends TermVectorsReader implements Closeable {

  private final FieldInfos fieldInfos;
  final CompressingStoredFieldsIndexReader indexReader;
  final IndexInput vectorsStream;
  private final int version;
  private final int packedIntsVersion;
  private final CompressionMode compressionMode;
  private final Decompressor decompressor;
  private final int chunkSize;
  private final int numDocs;
  private boolean closed;
  private final BlockPackedReaderIterator reader;
  private final long numChunks; // number of compressed blocks written
  private final long numDirtyChunks; // number of incomplete compressed blocks written
  private final long maxPointer; // end of the data section

  // used by clone
  private CompressingTermVectorsReader(CompressingTermVectorsReader reader) {
    this.fieldInfos = reader.fieldInfos;
    this.vectorsStream = reader.vectorsStream.clone();
    this.indexReader = reader.indexReader.clone();
    this.packedIntsVersion = reader.packedIntsVersion;
    this.compressionMode = reader.compressionMode;
    this.decompressor = reader.decompressor.clone();
    this.chunkSize = reader.chunkSize;
    this.numDocs = reader.numDocs;
    this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);
    this.version = reader.version;
    this.numChunks = reader.numChunks;
    this.numDirtyChunks = reader.numDirtyChunks;
    this.maxPointer = reader.maxPointer;
    this.closed = false;
  }

  /** Sole constructor. */
  public CompressingTermVectorsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
      IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.maxDoc();
    int version = -1;
    CompressingStoredFieldsIndexReader indexReader = null;
    long maxPointer = -1;

    // Load the index into memory
    final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION);
    try (ChecksumIndexInput input = d.openChecksumInput(indexName, context)) {
      Throwable priorE = null;
      try {
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
        version = CodecUtil.checkIndexHeader(input, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
        assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == input.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(input, si);
        maxPointer = input.readVLong(); // end of the data section
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(input, priorE);
      }
    }

    this.version = version;
    this.indexReader = indexReader;
    this.maxPointer = maxPointer;

    try {
      // Open the data file and read metadata
      final String vectorsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, VECTORS_EXTENSION);
      vectorsStream = d.openInput(vectorsStreamFN, context);
      final String codecNameDat = formatName + CODEC_SFX_DAT;
      int version2 = CodecUtil.checkIndexHeader(vectorsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
      if (version != version2) {
        throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + version2, vectorsStream);
      }
      assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == vectorsStream.getFilePointer();

      long pos = vectorsStream.getFilePointer();

      if (version >= VERSION_CHUNK_STATS) {
        vectorsStream.seek(maxPointer);
        numChunks = vectorsStream.readVLong();
        numDirtyChunks = vectorsStream.readVLong();
        if (numDirtyChunks > numChunks) {
          throw new CorruptIndexException("invalid chunk counts: dirty=" + numDirtyChunks + ", total=" + numChunks, vectorsStream);
        }
      } else {
        numChunks = numDirtyChunks = -1;
      }

      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(vectorsStream);
      vectorsStream.seek(pos);

      packedIntsVersion = vectorsStream.readVInt();
      chunkSize = vectorsStream.readVInt();
      decompressor = compressionMode.newDecompressor();
      this.reader = new BlockPackedReaderIterator(vectorsStream, packedIntsVersion, PACKED_BLOCK_SIZE, 0);

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  CompressionMode getCompressionMode() {
    return compressionMode;
  }

  int getChunkSize() {
    return chunkSize;
  }

  int getPackedIntsVersion() {
    return packedIntsVersion;
  }

  int getVersion() {
    return version;
  }

  CompressingStoredFieldsIndexReader getIndexReader() {
    return indexReader;
  }

  IndexInput getVectorsStream() {
    return vectorsStream;
  }

  long getMaxPointer() {
    return maxPointer;
  }

  long getNumChunks() {
    return numChunks;
  }

  long getNumDirtyChunks() {
    return numDirtyChunks;
  }

  /**
   * @throws AlreadyClosedException if this TermVectorsReader is closed
   */
  private void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this FieldsReader is closed");
    }
  }

  @Override
  public void close() throws IOException {
    if (!closed) {
      IOUtils.close(vectorsStream);
      closed = true;
    }
  }

  @Override
  public TermVectorsReader clone() {
    return new CompressingTermVectorsReader(this);
  }

  @Override
  public Fields get(int doc) throws IOException {
    ensureOpen();

    // seek to the right place
    {
      final long startPointer = indexReader.getStartPointer(doc);
      vectorsStream.seek(startPointer);
    }

    // decode
    // - docBase: first doc ID of the chunk
    // - chunkDocs: number of docs of the chunk
    final int docBase = vectorsStream.readVInt();
    final int chunkDocs = vectorsStream.readVInt();
    if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs) {
      throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc, vectorsStream);
    }

    final int skip; // number of fields to skip
    final int numFields; // number of fields of the document we're looking for
    final int totalFields; // total number of fields of the chunk (sum for all docs)
    if (chunkDocs == 1) {
      skip = 0;
      numFields = totalFields = vectorsStream.readVInt();
    } else {
      reader.reset(vectorsStream, chunkDocs);
      int sum = 0;
      for (int i = docBase; i < doc; ++i) {
        sum += reader.next();
      }
      skip = sum;
      numFields = (int) reader.next();
      sum += numFields;
      for (int i = doc + 1; i < docBase + chunkDocs; ++i) {
        sum += reader.next();
      }
      totalFields = sum;
    }
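
    // From here on, per-field data of the whole chunk is stored flat, ordered
    // by (doc, field): entries [0, skip) belong to earlier docs of the chunk,
    // [skip, skip + numFields) to the requested doc, the rest to later docs.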
    if (numFields == 0) {
      // no vectors
      return null;
    }

    // read field numbers that have term vectors
    final int[] fieldNums;
    {
      final int token = vectorsStream.readByte() & 0xFF;
      assert token != 0; // we checked earlier that numFields != 0
      final int bitsPerFieldNum = token & 0x1F;
      int totalDistinctFields = token >>> 5;
      if (totalDistinctFields == 0x07) {
        totalDistinctFields += vectorsStream.readVInt();
      }
      ++totalDistinctFields;
      final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
      fieldNums = new int[totalDistinctFields];
      for (int i = 0; i < totalDistinctFields; ++i) {
        fieldNums[i] = (int) it.next();
      }
    }

    // read field numbers and flags
    final int[] fieldNumOffs = new int[numFields];
    final PackedInts.Reader flags;
    {
      final int bitsPerOff = PackedInts.bitsRequired(fieldNums.length - 1);
      final PackedInts.Reader allFieldNumOffs = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
      switch (vectorsStream.readVInt()) {
        case 0:
          // one flags entry per distinct field of the chunk, look it up by field num offset
          final PackedInts.Reader fieldFlags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, fieldNums.length, FLAGS_BITS);
          PackedInts.Mutable f = PackedInts.getMutable(totalFields, FLAGS_BITS, PackedInts.COMPACT);
          for (int i = 0; i < totalFields; ++i) {
            final int fieldNumOff = (int) allFieldNumOffs.get(i);
            assert fieldNumOff >= 0 && fieldNumOff < fieldNums.length;
            final int fgs = (int) fieldFlags.get(fieldNumOff);
            f.set(i, fgs);
          }
          flags = f;
          break;
        case 1:
          // one flags entry per field occurrence
          flags = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, FLAGS_BITS);
          break;
        default:
          throw new AssertionError();
      }
      for (int i = 0; i < numFields; ++i) {
        fieldNumOffs[i] = (int) allFieldNumOffs.get(skip + i);
      }
    }

    // number of terms per field for all fields
    final PackedInts.Reader numTerms;
    final int totalTerms;
    {
      final int bitsRequired = vectorsStream.readVInt();
      numTerms = PackedInts.getReaderNoHeader(vectorsStream, PackedInts.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
      int sum = 0;
      for (int i = 0; i < totalFields; ++i) {
        sum += numTerms.get(i);
      }
      totalTerms = sum;
    }
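
    // term lengths: each term is front-coded as a prefix length (number of
    // bytes shared with the previous term) plus a suffix length; only the
    // suffix bytes go into the compressed data block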
    int docOff = 0, docLen = 0, totalLen;
    final int[] fieldLengths = new int[numFields];
    final int[][] prefixLengths = new int[numFields][];
    final int[][] suffixLengths = new int[numFields][];
    {
      reader.reset(vectorsStream, totalTerms);
      // skip
      int toSkip = 0;
      for (int i = 0; i < skip; ++i) {
        toSkip += numTerms.get(i);
      }
      reader.skip(toSkip);
      // read prefix lengths
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        final int[] fieldPrefixLengths = new int[termCount];
        prefixLengths[i] = fieldPrefixLengths;
        for (int j = 0; j < termCount; ) {
          final LongsRef next = reader.next(termCount - j);
          for (int k = 0; k < next.length; ++k) {
            fieldPrefixLengths[j++] = (int) next.longs[next.offset + k];
          }
        }
      }
      reader.skip(totalTerms - reader.ord());

      reader.reset(vectorsStream, totalTerms);
      // skip suffix lengths of preceding docs, accumulating the number of
      // bytes that precede doc's data in the decompressed chunk
      for (int i = 0; i < skip; ++i) {
        for (int j = 0; j < numTerms.get(i); ++j) {
          docOff += reader.next();
        }
      }
      // read suffix lengths
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        final int[] fieldSuffixLengths = new int[termCount];
        suffixLengths[i] = fieldSuffixLengths;
        for (int j = 0; j < termCount; ) {
          final LongsRef next = reader.next(termCount - j);
          for (int k = 0; k < next.length; ++k) {
            fieldSuffixLengths[j++] = (int) next.longs[next.offset + k];
          }
        }
        fieldLengths[i] = sum(suffixLengths[i]);
        docLen += fieldLengths[i];
      }
      totalLen = docOff + docLen;
      for (int i = skip + numFields; i < totalFields; ++i) {
        for (int j = 0; j < numTerms.get(i); ++j) {
          totalLen += reader.next();
        }
      }
    }

    // term freqs
    final int[] termFreqs = new int[totalTerms];
    {
      reader.reset(vectorsStream, totalTerms);
      for (int i = 0; i < totalTerms; ) {
        final LongsRef next = reader.next(totalTerms - i);
        for (int k = 0; k < next.length; ++k) {
          termFreqs[i++] = 1 + (int) next.longs[next.offset + k]; // freqs are stored minus one
        }
      }
    }

    // total number of positions, offsets and payloads
    int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;
    for (int i = 0, termIndex = 0; i < totalFields; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      for (int j = 0; j < termCount; ++j) {
        final int freq = termFreqs[termIndex++];
        if ((f & POSITIONS) != 0) {
          totalPositions += freq;
        }
        if ((f & OFFSETS) != 0) {
          totalOffsets += freq;
        }
        if ((f & PAYLOADS) != 0) {
          totalPayloads += freq;
        }
      }
      assert i != totalFields - 1 || termIndex == totalTerms : termIndex + " " + totalTerms;
    }

    final int[][] positionIndex = positionIndex(skip, numFields, numTerms, termFreqs);
    final int[][] positions, startOffsets, lengths;
    if (totalPositions > 0) {
      positions = readPositions(skip, numFields, flags, numTerms, termFreqs, POSITIONS, totalPositions, positionIndex);
    } else {
      positions = new int[numFields][];
    }
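
    // Offsets are stored compressed: start offsets as deltas from an expected
    // value (average chars per term * position) and lengths as deltas from
    // the term length, so both get patched back below.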
    if (totalOffsets > 0) {
      // average number of chars per term
      final float[] charsPerTerm = new float[fieldNums.length];
      for (int i = 0; i < charsPerTerm.length; ++i) {
        charsPerTerm[i] = Float.intBitsToFloat(vectorsStream.readInt());
      }
      startOffsets = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);
      lengths = readPositions(skip, numFields, flags, numTerms, termFreqs, OFFSETS, totalOffsets, positionIndex);

      for (int i = 0; i < numFields; ++i) {
        final int[] fStartOffsets = startOffsets[i];
        final int[] fPositions = positions[i];
        // patch offsets from positions
        if (fStartOffsets != null && fPositions != null) {
          final float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
          for (int j = 0; j < startOffsets[i].length; ++j) {
            fStartOffsets[j] += (int) (fieldCharsPerTerm * fPositions[j]);
          }
        }
        if (fStartOffsets != null) {
          final int[] fPrefixLengths = prefixLengths[i];
          final int[] fSuffixLengths = suffixLengths[i];
          final int[] fLengths = lengths[i];
          for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
            // delta-decode start offsets and patch lengths using term lengths
            final int termLength = fPrefixLengths[j] + fSuffixLengths[j];
            lengths[i][positionIndex[i][j]] += termLength;
            for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k) {
              fStartOffsets[k] += fStartOffsets[k - 1];
              fLengths[k] += termLength;
            }
          }
        }
      }
    } else {
      startOffsets = lengths = new int[numFields][];
    }
    if (totalPositions > 0) {
      // delta-decode positions
      for (int i = 0; i < numFields; ++i) {
        final int[] fPositions = positions[i];
        final int[] fpositionIndex = positionIndex[i];
        if (fPositions != null) {
          for (int j = 0, end = (int) numTerms.get(skip + i); j < end; ++j) {
            // positions are stored as deltas within each term's occurrence list
            for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k) {
              fPositions[k] += fPositions[k - 1];
            }
          }
        }
      }
    }
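
    // payload lengths: payload bytes of the whole chunk live in the same
    // compressed block as the term bytes; only lengths are read here, and
    // payloadOff/payloadLen locate the slice that belongs to doc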
    final int[][] payloadIndex = new int[numFields][];
    int totalPayloadLength = 0;
    int payloadOff = 0;
    int payloadLen = 0;
    if (totalPayloads > 0) {
      reader.reset(vectorsStream, totalPayloads);
      // skip payload lengths of the docs that precede doc in the chunk
      int termIndex = 0;
      for (int i = 0; i < skip; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        if ((f & PAYLOADS) != 0) {
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              final int l = (int) reader.next();
              payloadOff += l;
            }
          }
        }
        termIndex += termCount;
      }
      totalPayloadLength = payloadOff;
      // read doc payload lengths
      for (int i = 0; i < numFields; ++i) {
        final int f = (int) flags.get(skip + i);
        final int termCount = (int) numTerms.get(skip + i);
        if ((f & PAYLOADS) != 0) {
          final int totalFreq = positionIndex[i][termCount];
          payloadIndex[i] = new int[totalFreq + 1];
          int posIdx = 0;
          payloadIndex[i][posIdx] = payloadLen;
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              final int payloadLength = (int) reader.next();
              payloadLen += payloadLength;
              payloadIndex[i][posIdx + 1] = payloadLen;
              ++posIdx;
            }
          }
          assert posIdx == totalFreq;
        }
        termIndex += termCount;
      }
      totalPayloadLength += payloadLen;
      // skip payload lengths of the docs that come after doc in the chunk
      for (int i = skip + numFields; i < totalFields; ++i) {
        final int f = (int) flags.get(i);
        final int termCount = (int) numTerms.get(i);
        if ((f & PAYLOADS) != 0) {
          for (int j = 0; j < termCount; ++j) {
            final int freq = termFreqs[termIndex + j];
            for (int k = 0; k < freq; ++k) {
              totalPayloadLength += reader.next();
            }
          }
        }
        termIndex += termCount;
      }
      assert termIndex == totalTerms : termIndex + " " + totalTerms;
    }

    // decompress: only doc's slice of the chunk is kept, i.e. docLen suffix
    // bytes followed by payloadLen payload bytes
    final BytesRef suffixBytes = new BytesRef();
    decompressor.decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
    suffixBytes.length = docLen;
    final BytesRef payloadBytes = new BytesRef(suffixBytes.bytes, suffixBytes.offset + docLen, payloadLen);

    final int[] fieldFlags = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
      fieldFlags[i] = (int) flags.get(skip + i);
    }

    final int[] fieldNumTerms = new int[numFields];
    for (int i = 0; i < numFields; ++i) {
      fieldNumTerms[i] = (int) numTerms.get(skip + i);
    }

    final int[][] fieldTermFreqs = new int[numFields][];
    {
      int termIdx = 0;
      for (int i = 0; i < skip; ++i) {
        termIdx += numTerms.get(i);
      }
      for (int i = 0; i < numFields; ++i) {
        final int termCount = (int) numTerms.get(skip + i);
        fieldTermFreqs[i] = new int[termCount];
        for (int j = 0; j < termCount; ++j) {
          fieldTermFreqs[i][j] = termFreqs[termIdx++];
        }
      }
    }

    assert sum(fieldLengths) == docLen : sum(fieldLengths) + " != " + docLen;

    return new TVFields(fieldNums, fieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths,
        prefixLengths, suffixLengths, fieldTermFreqs,
        positionIndex, positions, startOffsets, lengths,
        payloadBytes, payloadIndex,
        suffixBytes);
  }

  // field -> term index -> position index
  private int[][] positionIndex(int skip, int numFields, PackedInts.Reader numTerms, int[] termFreqs) {
    final int[][] positionIndex = new int[numFields][];
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
      final int termCount = (int) numTerms.get(i);
      termIndex += termCount;
    }
    for (int i = 0; i < numFields; ++i) {
      final int termCount = (int) numTerms.get(skip + i);
      positionIndex[i] = new int[termCount + 1];
      for (int j = 0; j < termCount; ++j) {
        final int freq = termFreqs[termIndex + j];
        positionIndex[i][j + 1] = positionIndex[i][j] + freq;
      }
      termIndex += termCount;
    }
    return positionIndex;
  }
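
  // Reads the stream selected by flag (positions, start offsets or lengths)
  // for the fields of the requested document, skipping over entries that
  // belong to the other documents of the chunk.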
  private int[][] readPositions(int skip, int numFields, PackedInts.Reader flags, PackedInts.Reader numTerms, int[] termFreqs, int flag, final int totalPositions, int[][] positionIndex) throws IOException {
    final int[][] positions = new int[numFields][];
    reader.reset(vectorsStream, totalPositions);
    // skip
    int toSkip = 0;
    int termIndex = 0;
    for (int i = 0; i < skip; ++i) {
      final int f = (int) flags.get(i);
      final int termCount = (int) numTerms.get(i);
      if ((f & flag) != 0) {
        for (int j = 0; j < termCount; ++j) {
          final int freq = termFreqs[termIndex + j];
          toSkip += freq;
        }
      }
      termIndex += termCount;
    }
    reader.skip(toSkip);
    // read doc positions
    for (int i = 0; i < numFields; ++i) {
      final int f = (int) flags.get(skip + i);
      final int termCount = (int) numTerms.get(skip + i);
      if ((f & flag) != 0) {
        final int totalFreq = positionIndex[i][termCount];
        final int[] fieldPositions = new int[totalFreq];
        positions[i] = fieldPositions;
        for (int j = 0; j < totalFreq; ) {
          final LongsRef nextPositions = reader.next(totalFreq - j);
          for (int k = 0; k < nextPositions.length; ++k) {
            fieldPositions[j++] = (int) nextPositions.longs[nextPositions.offset + k];
          }
        }
      }
      termIndex += termCount;
    }
    reader.skip(totalPositions - reader.ord());
    return positions;
  }
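
  // Term vectors of a single document, backed by the arrays decoded in
  // get(int).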
  private class TVFields extends Fields {

    private final int[] fieldNums, fieldFlags, fieldNumOffs, numTerms, fieldLengths;
    private final int[][] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
    private final BytesRef suffixBytes, payloadBytes;

    public TVFields(int[] fieldNums, int[] fieldFlags, int[] fieldNumOffs, int[] numTerms, int[] fieldLengths,
        int[][] prefixLengths, int[][] suffixLengths, int[][] termFreqs,
        int[][] positionIndex, int[][] positions, int[][] startOffsets, int[][] lengths,
        BytesRef payloadBytes, int[][] payloadIndex,
        BytesRef suffixBytes) {
      this.fieldNums = fieldNums;
      this.fieldFlags = fieldFlags;
      this.fieldNumOffs = fieldNumOffs;
      this.numTerms = numTerms;
      this.fieldLengths = fieldLengths;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadBytes = payloadBytes;
      this.payloadIndex = payloadIndex;
      this.suffixBytes = suffixBytes;
    }

    @Override
    public Iterator<String> iterator() {
      return new Iterator<String>() {
        int i = 0;
        @Override
        public boolean hasNext() {
          return i < fieldNumOffs.length;
        }
        @Override
        public String next() {
          if (!hasNext()) {
            throw new NoSuchElementException();
          }
          final int fieldNum = fieldNums[fieldNumOffs[i++]];
          return fieldInfos.fieldInfo(fieldNum).name;
        }
        @Override
        public void remove() {
          throw new UnsupportedOperationException();
        }
      };
    }

    @Override
    public Terms terms(String field) throws IOException {
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      if (fieldInfo == null) {
        return null;
      }
      int idx = -1;
      for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
          idx = i;
          break;
        }
      }

      if (idx == -1 || numTerms[idx] == 0) {
        // no term
        return null;
      }
      int fieldOff = 0, fieldLen = -1;
      for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (i < idx) {
          fieldOff += fieldLengths[i];
        } else {
          fieldLen = fieldLengths[i];
          break;
        }
      }
      assert fieldLen >= 0;
      return new TVTerms(numTerms[idx], fieldFlags[idx],
          prefixLengths[idx], suffixLengths[idx], termFreqs[idx],
          positionIndex[idx], positions[idx], startOffsets[idx], lengths[idx],
          payloadIndex[idx], payloadBytes,
          new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
    }

    @Override
    public int size() {
      return fieldNumOffs.length;
    }

  }
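
  // Term vector terms of a single field of a single document.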
  private class TVTerms extends Terms {

    private final int numTerms, flags;
    private final int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
    private final BytesRef termBytes, payloadBytes;

    TVTerms(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs,
        int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
        int[] payloadIndex, BytesRef payloadBytes,
        BytesRef termBytes) {
      this.numTerms = numTerms;
      this.flags = flags;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadIndex = payloadIndex;
      this.payloadBytes = payloadBytes;
      this.termBytes = termBytes;
    }

    @Override
    public TermsEnum iterator() throws IOException {
      TVTermsEnum termsEnum = new TVTermsEnum();
      termsEnum.reset(numTerms, flags, prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths,
          payloadIndex, payloadBytes,
          new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
      return termsEnum;
    }

    @Override
    public long size() throws IOException {
      return numTerms;
    }

    @Override
    public long getSumTotalTermFreq() throws IOException {
      return -1L;
    }

    @Override
    public long getSumDocFreq() throws IOException {
      return numTerms;
    }

    @Override
    public int getDocCount() throws IOException {
      return 1;
    }

    @Override
    public boolean hasFreqs() {
      return true;
    }

    @Override
    public boolean hasOffsets() {
      return (flags & OFFSETS) != 0;
    }

    @Override
    public boolean hasPositions() {
      return (flags & POSITIONS) != 0;
    }

    @Override
    public boolean hasPayloads() {
      return (flags & PAYLOADS) != 0;
    }

  }
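
  // Iterates over the front-coded term bytes of a single field; terms are
  // stored in sorted order, so seekCeil can be a simple linear scan.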
  private static class TVTermsEnum extends TermsEnum {

    private int numTerms, startPos, ord;
    private int[] prefixLengths, suffixLengths, termFreqs, positionIndex, positions, startOffsets, lengths, payloadIndex;
    private ByteArrayDataInput in;
    private BytesRef payloads;
    private final BytesRef term;

    private TVTermsEnum() {
      term = new BytesRef(16);
    }

    void reset(int numTerms, int flags, int[] prefixLengths, int[] suffixLengths, int[] termFreqs, int[] positionIndex, int[] positions, int[] startOffsets, int[] lengths,
        int[] payloadIndex, BytesRef payloads, ByteArrayDataInput in) {
      this.numTerms = numTerms;
      this.prefixLengths = prefixLengths;
      this.suffixLengths = suffixLengths;
      this.termFreqs = termFreqs;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.payloadIndex = payloadIndex;
      this.payloads = payloads;
      this.in = in;
      startPos = in.getPosition();
      reset();
    }

    void reset() {
      term.length = 0;
      in.setPosition(startPos);
      ord = -1;
    }

    @Override
    public BytesRef next() throws IOException {
      if (ord == numTerms - 1) {
        return null;
      } else {
        assert ord < numTerms;
        ++ord;
      }

      // read term: the first prefixLengths[ord] bytes are shared with the
      // previous term, only the suffix is read from the stream
      term.offset = 0;
      term.length = prefixLengths[ord] + suffixLengths[ord];
      if (term.length > term.bytes.length) {
        term.bytes = ArrayUtil.grow(term.bytes, term.length);
      }
      in.readBytes(term.bytes, prefixLengths[ord], suffixLengths[ord]);

      return term;
    }

    @Override
    public SeekStatus seekCeil(BytesRef text) throws IOException {
      if (ord < numTerms && ord >= 0) {
        final int cmp = term().compareTo(text);
        if (cmp == 0) {
          return SeekStatus.FOUND;
        } else if (cmp > 0) {
          reset();
        }
      }

      // linear scan
      while (true) {
        final BytesRef term = next();
        if (term == null) {
          return SeekStatus.END;
        }
        final int cmp = term.compareTo(text);
        if (cmp > 0) {
          return SeekStatus.NOT_FOUND;
        } else if (cmp == 0) {
          return SeekStatus.FOUND;
        }
      }
    }

    @Override
    public void seekExact(long ord) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public BytesRef term() throws IOException {
      return term;
    }

    @Override
    public long ord() throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public int docFreq() throws IOException {
      return 1;
    }

    @Override
    public long totalTermFreq() throws IOException {
      return termFreqs[ord];
    }

    @Override
    public final PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      if (PostingsEnum.featureRequested(flags, DocsAndPositionsEnum.OLD_NULL_SEMANTICS)) {
        if (positions == null && startOffsets == null) {
          // Positions were not indexed:
          return null;
        }
      }

      final TVPostingsEnum docsEnum;
      if (reuse instanceof TVPostingsEnum) {
        docsEnum = (TVPostingsEnum) reuse;
      } else {
        docsEnum = new TVPostingsEnum();
      }

      docsEnum.reset(termFreqs[ord], positionIndex[ord], positions, startOffsets, lengths, payloads, payloadIndex);
      return docsEnum;
    }

  }
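
  // Postings of a single term of an in-memory term vector: exposes exactly
  // one document (doc ID 0) whose positions, offsets and payloads were
  // decoded up-front.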
  private static class TVPostingsEnum extends PostingsEnum {

    private int doc = -1;
    private int termFreq;
    private int positionIndex;
    private int[] positions;
    private int[] startOffsets;
    private int[] lengths;
    private final BytesRef payload;
    private int[] payloadIndex;
    private int basePayloadOffset;
    private int i;

    TVPostingsEnum() {
      payload = new BytesRef();
    }

    public void reset(int freq, int positionIndex, int[] positions,
        int[] startOffsets, int[] lengths, BytesRef payloads,
        int[] payloadIndex) {
      this.termFreq = freq;
      this.positionIndex = positionIndex;
      this.positions = positions;
      this.startOffsets = startOffsets;
      this.lengths = lengths;
      this.basePayloadOffset = payloads.offset;
      this.payload.bytes = payloads.bytes;
      payload.offset = payload.length = 0;
      this.payloadIndex = payloadIndex;

      doc = i = -1;
    }

    private void checkDoc() {
      if (doc == NO_MORE_DOCS) {
        throw new IllegalStateException("DocsEnum exhausted");
      } else if (doc == -1) {
        throw new IllegalStateException("DocsEnum not started");
      }
    }

    private void checkPosition() {
      checkDoc();
      if (i < 0) {
        throw new IllegalStateException("Position enum not started");
      } else if (i >= termFreq) {
        throw new IllegalStateException("Read past last position");
      }
    }

    @Override
    public int nextPosition() throws IOException {
      if (doc != 0) {
        throw new IllegalStateException();
      } else if (i >= termFreq - 1) {
        throw new IllegalStateException("Read past last position");
      }

      ++i;

      if (payloadIndex != null) {
        payload.offset = basePayloadOffset + payloadIndex[positionIndex + i];
        payload.length = payloadIndex[positionIndex + i + 1] - payloadIndex[positionIndex + i];
      }

      if (positions == null) {
        return -1;
      } else {
        return positions[positionIndex + i];
      }
    }

    @Override
    public int startOffset() throws IOException {
      checkPosition();
      if (startOffsets == null) {
        return -1;
      } else {
        return startOffsets[positionIndex + i];
      }
    }

    @Override
    public int endOffset() throws IOException {
      checkPosition();
      if (startOffsets == null) {
        return -1;
      } else {
        return startOffsets[positionIndex + i] + lengths[positionIndex + i];
      }
    }

    @Override
    public BytesRef getPayload() throws IOException {
      checkPosition();
      if (payloadIndex == null || payload.length == 0) {
        return null;
      } else {
        return payload;
      }
    }

    @Override
    public int freq() throws IOException {
      checkDoc();
      return termFreq;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() throws IOException {
      if (doc == -1) {
        return (doc = 0);
      } else {
        return (doc = NO_MORE_DOCS);
      }
    }

    @Override
    public int advance(int target) throws IOException {
      return slowAdvance(target);
    }

    @Override
    public long cost() {
      return 1;
    }
  }

  private static int sum(int[] arr) {
    int sum = 0;
    for (int el : arr) {
      sum += el;
    }
    return sum;
  }

  @Override
  public long ramBytesUsed() {
    return indexReader.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.singleton(Accountables.namedAccountable("term vector index", indexReader));
  }

  @Override
  public void checkIntegrity() throws IOException {
    CodecUtil.checksumEntireFile(vectorsStream);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(mode=" + compressionMode + ",chunksize=" + chunkSize + ")";
  }
}